{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1.查看数据集整体"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "files = os.listdir(\"ml-latest-small\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "datalist = []\n",
    "for file in files:\n",
    "    name = re.findall('(.*)\\.(.*)',file)\n",
    "    if name[0][1] == 'csv':\n",
    "        datalist.append(file)\n",
    "files = datalist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['links.csv', 'tags.csv', 'ratings.csv', 'movies.csv']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files\n",
    "#一共有4个csv文件"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2.分析4个数据集的具体信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"ml-latest-small/{}\".format(files[0])\n",
    "links = pd.read_csv(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(9742, 3)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "links.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 9742 entries, 0 to 9741\n",
      "Data columns (total 3 columns):\n",
      "movieId    9742 non-null int64\n",
      "imdbId     9742 non-null int64\n",
      "tmdbId     9734 non-null float64\n",
      "dtypes: float64(1), int64(2)\n",
      "memory usage: 228.4 KB\n"
     ]
    }
   ],
   "source": [
    "links.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>imdbId</th>\n",
       "      <th>tmdbId</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>114709</td>\n",
       "      <td>862.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>113497</td>\n",
       "      <td>8844.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>113228</td>\n",
       "      <td>15602.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>114885</td>\n",
       "      <td>31357.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>113041</td>\n",
       "      <td>11862.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId  imdbId   tmdbId\n",
       "0        1  114709    862.0\n",
       "1        2  113497   8844.0\n",
       "2        3  113228  15602.0\n",
       "3        4  114885  31357.0\n",
       "4        5  113041  11862.0"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "links.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**links.csv文件内容：**  \n",
    "对于links.csv文件来说，一共是三列特征：   \n",
    "* moviedId:为https://movielens.org 网站使用的电影的标识符  \n",
    "* imdbld:为http://www.imdb.com 网站使用的电影标识符\n",
    "* tmdbld:为https://www.themoviedb.org 网站使用的电影标识符号\n",
    "\n",
    "以movieId=1为例\n",
    "可以得到：  \n",
    "在movielens中的网站信息为https://movielens.org/movies/1  \n",
    "在imdbld中的网站信息为http://www.imdb.com/title/tt0114709/  \n",
    "在tmdbld中的网站信息为https://www.themoviedb.org/movie/862  \n",
    "\n",
    "**links.csv文件规模:**  \n",
    "含有9742部电影的信息  \n",
    "其中imdbid存在8个缺失值\n",
    "\n",
    "**links.csv文件应用:**  \n",
    "关于这个文件，目前想到的就是可以得到对应网站的相关介绍"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"ml-latest-small/{}\".format(files[1])\n",
    "tags = pd.read_csv(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 3683 entries, 0 to 3682\n",
      "Data columns (total 4 columns):\n",
      "userId       3683 non-null int64\n",
      "movieId      3683 non-null int64\n",
      "tag          3683 non-null object\n",
      "timestamp    3683 non-null int64\n",
      "dtypes: int64(3), object(1)\n",
      "memory usage: 115.2+ KB\n"
     ]
    }
   ],
   "source": [
    "tags.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3683, 4)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>tag</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>60756</td>\n",
       "      <td>funny</td>\n",
       "      <td>1445714994</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>60756</td>\n",
       "      <td>Highly quotable</td>\n",
       "      <td>1445714996</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>60756</td>\n",
       "      <td>will ferrell</td>\n",
       "      <td>1445714992</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>89774</td>\n",
       "      <td>Boxing story</td>\n",
       "      <td>1445715207</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>89774</td>\n",
       "      <td>MMA</td>\n",
       "      <td>1445715200</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieId              tag   timestamp\n",
       "0       2    60756            funny  1445714994\n",
       "1       2    60756  Highly quotable  1445714996\n",
       "2       2    60756     will ferrell  1445714992\n",
       "3       2    89774     Boxing story  1445715207\n",
       "4       2    89774              MMA  1445715200"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1572"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#求打过标签的电影数目\n",
    "len(tags.movieId.unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**tags文件内容**  \n",
    "包含4个特征  \n",
    "* userId:用户id\n",
    "* movieId:movielens中的id\n",
    "* tag:用户打的元标签\n",
    "* timestamp:打标签的时间戳\n",
    "\n",
    "**tags文件规模**  \n",
    "一共有3683条标签记录  \n",
    "针对1572部电影打了标签，打标签的电影占电影总数的16%\n",
    "\n",
    "**tags文件应用**   \n",
    "可以用户打标签的习惯，比如倾向于对一部电影打几个标签  \n",
    "提炼热门电影的关键词  \n",
    "用于电影类型的分类  \n",
    "衡量电影的热门程度  \n",
    "打标签与打分数之间的关联"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"ml-latest-small/{}\".format(files[2])\n",
    "ratings = pd.read_csv(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 100836 entries, 0 to 100835\n",
      "Data columns (total 4 columns):\n",
      "userId       100836 non-null int64\n",
      "movieId      100836 non-null int64\n",
      "rating       100836 non-null float64\n",
      "timestamp    100836 non-null int64\n",
      "dtypes: float64(1), int64(3)\n",
      "memory usage: 3.1 MB\n"
     ]
    }
   ],
   "source": [
    "ratings.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964982703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964981247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964982224</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>47</td>\n",
       "      <td>5.0</td>\n",
       "      <td>964983815</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>50</td>\n",
       "      <td>5.0</td>\n",
       "      <td>964982931</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieId  rating  timestamp\n",
       "0       1        1     4.0  964982703\n",
       "1       1        3     4.0  964981247\n",
       "2       1        6     4.0  964982224\n",
       "3       1       47     5.0  964983815\n",
       "4       1       50     5.0  964982931"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9724"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(ratings.movieId.unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**ratings文件内容**  \n",
    "一共包含4个特征  \n",
    "* userId:用户id\n",
    "* moviedId:电影id\n",
    "* rating:用户评分，评分以5星级为标准，以半星级递增\n",
    "* timestamp:评分时间戳  \n",
    "\n",
    "**ratings文件规模**  \n",
    "一共包含100836条评分记录9742部电影的评分，有评分的电影占总电影数的99.8%\n",
    "\n",
    "**ratings文件应用**  \n",
    "可以基于用户的评分协同过滤进行电影推荐  \n",
    "根据电影和类型评估哪类电影更容易得到高分  \n",
    "评分时间是否和评分等级有关系  \n",
    "衡量热门评分电影 \n",
    "衡量高评分电影"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"ml-latest-small/{}\".format(files[3])\n",
    "movies = pd.read_csv(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 9742 entries, 0 to 9741\n",
      "Data columns (total 3 columns):\n",
      "movieId    9742 non-null int64\n",
      "title      9742 non-null object\n",
      "genres     9742 non-null object\n",
      "dtypes: int64(1), object(2)\n",
      "memory usage: 228.4+ KB\n"
     ]
    }
   ],
   "source": [
    "movies.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Jumanji (1995)</td>\n",
       "      <td>Adventure|Children|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Grumpier Old Men (1995)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Waiting to Exhale (1995)</td>\n",
       "      <td>Comedy|Drama|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Father of the Bride Part II (1995)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId                               title  \\\n",
       "0        1                    Toy Story (1995)   \n",
       "1        2                      Jumanji (1995)   \n",
       "2        3             Grumpier Old Men (1995)   \n",
       "3        4            Waiting to Exhale (1995)   \n",
       "4        5  Father of the Bride Part II (1995)   \n",
       "\n",
       "                                        genres  \n",
       "0  Adventure|Animation|Children|Comedy|Fantasy  \n",
       "1                   Adventure|Children|Fantasy  \n",
       "2                               Comedy|Romance  \n",
       "3                         Comedy|Drama|Romance  \n",
       "4                                       Comedy  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**movies文件内容**  \n",
    "一共包含3个特征  \n",
    "* movieId:电影在MovieLens网站上使用的电影ID\n",
    "* title:电影名称，并且包含上映时间\n",
    "* genres:电影类型  \n",
    "\n",
    "**movies数据规模**  \n",
    "共包含9742电影的信息,一共是20种类别，比数据说明文件中多IMAX这个类别\n",
    " \n",
    "**movies数据应用**  \n",
    "可以分析下每年出品的电影数目，以及细分到类型  \n",
    "每种类型的电影的数目  \n",
    "用于电影相关度的衡量  \n",
    "电影类型的关联度，比如恐怖片往往是动作片  \n",
    "IMAX的电影个数和电影比例"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "为了便于进行电影类型的分析，将genres拆开"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "movies_clean = pd.DataFrame()\n",
    "for index in range(movies.shape[0]):\n",
    "    line = movies.iloc[index,:]\n",
    "    figures = line.genres\n",
    "    figlist = re.findall(\"[\\w -]+\",figures)\n",
    "    for figure in figlist:\n",
    "        newrow = pd.DataFrame({\"movieId\":[line.movieId],\"title\":[line.title],\"genres\":[figure]})\n",
    "        movies_clean = pd.concat([movies_clean,newrow])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Adventure</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Animation</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Children</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Fantasy</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId             title     genres\n",
       "0        1  Toy Story (1995)  Adventure\n",
       "0        1  Toy Story (1995)  Animation\n",
       "0        1  Toy Story (1995)   Children\n",
       "0        1  Toy Story (1995)     Comedy\n",
       "0        1  Toy Story (1995)    Fantasy"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies_clean.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',\n",
       "       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',\n",
       "       'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX',\n",
       "       'Western', 'Film-Noir', 'no genres listed'], dtype=object)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies_clean.genres.unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3.将清洗后的数据进行保存"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [],
   "source": [
    "links.to_csv('cleaned_data/links.csv',index=False)\n",
    "\n",
    "movies_clean.to_csv('cleaned_data/movies.csv',index=False)\n",
    "\n",
    "ratings.to_csv('cleaned_data/ratings.csv',index=False)\n",
    "\n",
    "tags.to_csv('cleaned_data/tags.csv',index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
